# analytics libraries installed listed in the kaggle/python docker image:
# Input data files are available in the "../input/" directory.
#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
import csv
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
# Use the "ggplot" style sheet for every figure drawn by this script.
plt.style.use('ggplot')
import pylab
import seaborn as sns
from IPython.core.display import display, HTML
# Load the Iris measurements; the first CSV row holds the column names.
data = pd.read_csv("../input/Iris.csv", header=0)
# reset_index() keeps the previous index as a new leading "index" column, so
# later positional slices such as data.columns[2:] skip it (and, presumably,
# the CSV's own id column -- confirm against the file).
data = data.reset_index()

species_list = list(data["Species"].unique())
print("Types of species: %s\n" % species_list)
print("Dataset length: %i\n" % len(data))

# (display label, column name) for each of the four measurements.
measurements = [
    ("Sepal length", "SepalLengthCm"),
    ("Sepal width", "SepalWidthCm"),
    ("Petal length", "PetalLengthCm"),
    ("Petal width", "PetalWidthCm"),
]

# Value range of every measurement. Each line reads min and max from the
# same column, so the sepal width line no longer reports the sepal length max.
for label, col_name in measurements:
    series = data[col_name]
    print("%s range: [%s, %s]" % (label, series.min(), series.max()))
print()

# Variance, then standard deviation, via np.var / np.std as before.
# Headers are left-justified to the widest one so the values line up.
for stat_name, stat_func in (("variance", np.var), ("stddev", np.std)):
    headers = ["%s %s:" % (label, stat_name) for label, _ in measurements]
    width = max(len(header) for header in headers)
    for header, (_, col_name) in zip(headers, measurements):
        print("%s\t %f" % (header.ljust(width), stat_func(data[col_name])))
    print()

print("Data describe\n---")
# Print the summary table that the header above introduces.
print(data.describe())
# Observations:
# - 3 types of species
# - Relatively small dataset
# Histogram of each measurement, one subplot per column.
# data.hist calls data.plot
# pandas.DataFrame.plot() returns a matplotlib axis
# NOTE(review): "Species" is kept in the column list as in the original;
# DataFrame.hist is expected to skip non-numeric columns -- confirm.
data.hist(
    column=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm", "Species"],
    figsize=(10, 10),
    #,sharey=True, sharex=True
)
pylab.suptitle("Analyzing distribution for the series", fontsize="xx-large")
#plt.subplot(2,3,1) # if using subplot
#plt.title('your title')
# At first sight, petal length and petal width seem to diverge from the normal distribution.
import scipy.stats as stats

# p-value cutoff below which a column is reported as not normally distributed.
# NOTE(review): 0.055 is carried over unchanged; 0.05 is the conventional
# significance level -- confirm which one is intended.
NORMALITY_ALPHA = 0.055

# Run scipy's normality test on each measurement column and report its p-value.
for param in ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]:
    # normaltest returns (statistic, p-value); only the p-value is reported.
    z, pval = stats.normaltest(data[param])
    if pval < NORMALITY_ALPHA:
        print("%s has a p-value of %f - distribution is not normal" % (param, pval))
    else:
        print("%s has a p-value of %f" % (param, pval))
# The hypothesis has been confirmed. Why?
display(HTML('<h1>Analyzing the ' +
             '<a href="">' +
             'Pearson correlation coefficient</a></h1>'))

# data without the indexes
dt = data[data.columns[2:]]

# Correlation is only defined for numeric columns. Select them explicitly so
# non-numeric columns such as "Species" cannot make DataFrame.corr raise.
# method : {'pearson', 'kendall', 'spearman'}
corr = dt.select_dtypes(include=[np.number]).corr(method="pearson")  # returns a dataframe, so it can be reused

# Eliminate the upper triangle for readability: np.tril keeps the lower
# triangle including the diagonal, and where() blanks everything else to NaN.
# The builtin bool replaces the removed np.bool alias.
bool_upper_matrix = np.tril(np.ones(corr.shape)).astype(bool)
corr = corr.where(bool_upper_matrix)

# alternate method:
# seaborn matrix here
#sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
#            square=True, ax=ax)

# Draw on a dedicated figure so the heatmap does not land on a previous plot.
fig, ax = plt.subplots()
sns.heatmap(corr, cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
# Diagonal values and the upper triangle are ignored (the upper triangle is masked out through np.tril and df.where).
# Naturally, we find:
# As such, we observe correlations between these main attributes: PetalWidth, PetalLength and SepalLength.
# PCC is:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the three correlated attributes:
# petal width (x), petal length (y) and sepal length (z).
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
X = [data[col_name] for col_name in ("PetalWidthCm", "PetalLengthCm")]
n = 100
ax.scatter(X[0], X[1], data["SepalLengthCm"])
# Bare expression left over from the notebook (it displayed the column there);
# it has no effect when run as a script.
data[data.columns[2:3]] #x=data["Index"], y=data["PetalLengthCm"]
from sklearn import linear_model
#pd.scatter_matrix(dt, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
display(HTML('<h1>Scatterplots for the correlating pairs</h1>'))
# One scatter plot per correlated (x, y) attribute pair, in the original order.
for x_col, y_col in (
    ('PetalWidthCm', 'PetalLengthCm'),
    ('PetalLengthCm', 'SepalLengthCm'),
    ('PetalWidthCm', 'SepalLengthCm'),
):
    dt.plot(kind='scatter', x=x_col, y=y_col)
# --- linear regression visualization
# TODO: random selection method from sklearn
#top_corr_x_train = data["PetalWidthCm"][0:75]
#top_corr_y_train = data["PetalLengthCm"][0:75]
#top_corr_x_test = data["PetalWidthCm"][75:]
#top_corr_y_test = data["PetalLengthCm"][75:]
#regr = linear_model.LinearRegression()
#regr.fit(top_corr_x_train, top_corr_y_train)
## The coefficients
##print('Coefficients: \n', regr.coef_)
## The mean squared error
#print("Mean squared error: %.2f"
# % np.mean((regr.predict(top_corr_x_test) - top_corr_y_test) ** 2))
## Explained variance score: 1 is perfect prediction
#print('Variance score: %.2f' % regr.score(top_corr_x_test, top_corr_y_test))
#plt.plot(top_corr_x_test, regr.predict(top_corr_x_test), color='blue',
# linewidth=3)
#prediction = regr.predict(top_corr_x_test)
##prediction = prediction[:]
#print("Length: " + len(top_corr_x_test))
from sklearn import neighbors, datasets
from matplotlib.colors import ListedColormap
import math
import random
from numpy.random import permutation
# Columns at positions 2-5 (presumably the four measurement columns).
data_spl = data[data.columns[2:6]]
# Shuffle the row labels so the split below is random.
random_indices = permutation(data_spl.index)
# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = len(data_spl) // 3
# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
# The slice starts at 0 so that no sample is left out of both sets.
test = data_spl.loc[random_indices[:test_cutoff]]
# Generate the train set with the rest of the data.
train = data_spl.loc[random_indices[test_cutoff:]]
def predictKNN(train, labels, test, n_neighbors=2):
    """Fit a k-nearest-neighbours classifier and classify ``test``.

    Args:
        train: Training feature matrix handed to ``KNeighborsClassifier.fit``.
        labels: Target class for each row of ``train``.
        test: Feature matrix to classify.
        n_neighbors: Number of neighbours that vote on each prediction.

    Returns:
        A ``(predictions, bestScores)`` tuple: the predicted label for each
        row of ``test`` and the highest class probability for that row.
    """
    print("start knn")
    # Pass n_neighbors through so the parameter is actually honoured.
    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(train, labels)
    probabilities = knn.predict_proba(test)
    predictions = knn.predict(test)
    # Confidence of each prediction = probability of its most likely class.
    bestScores = probabilities.max(axis=1)
    print("done with knn")
    return predictions, bestScores
# Whole frame as a plain NumPy array.
data_sk = np.array(data)
# import some data to play with
#eiris = datasets.load_iris()
y = ["PetalWidthCm", "PetalLengthCm"]
# Feature matrix: the two columns at positions 2 and 3 of the frame.
# (Earlier throw-away assignments to X were removed; only this value was
# ever left in place.)
X = np.array(data[data.columns[2:4]])
# NOTE(review): column 0 is the "index" column added by reset_index(), so Y
# holds row numbers rather than species labels -- confirm this is intended.
Y = np.array(data[data.columns[0:1]]).ravel()
# h = .02 # step size in the mesh
# # Create color maps
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# for weights in ['uniform', 'distance']:
# # Plot the decision boundary. For that, we will assign a color to each
# # point in the mesh [x_min, x_max]x[y_min, y_max].
# x_min = min(X[0]) - 1 #X[0].min() - 1 #min(X[0]) - 1
# x_max = max(X[0]) + 1
# y_min = min(X[1]) - 1
# y_max = max(X[1]) + 1
# xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
# np.arange(y_min, y_max, h))
# #test = np.c_[xx.ravel(), yy.ravel()]
# #clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
# #clf.fit(X, y)
# Z, scores = predictKNN(X,y,test)
# #Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# # Put the result into a color plot
# Z = Z.reshape(xx.shape)
# plt.figure()
# plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# # Plot also the training points
# plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
# plt.xlim(xx.min(), xx.max())
# plt.ylim(yy.min(), yy.max())
# plt.title("3-Class classification (k = %i, weights = '%s')"
# % (n_neighbors, weights))
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target
# print(X)
# print(Y)
# print(np.bincount(Y, minlength=np.size(Y)))

h = .02  # step size in the mesh

# we create an instance of Neighbours Classifier and fit the data.
knn = neighbors.KNeighborsClassifier()
knn.fit(X, Y)

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Classify every mesh point; np.c_ pairs the flattened x and y coordinates.
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
